library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(geosphere)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following object is masked from 'package:base':
## 
##     date
# Load the merged trip + weather table.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0; the factor-based str()/summary() output recorded below
# (station names, usertype, *_ATTRIBUTES as factors) relies on text columns
# being read as factors.
masterdata <- read.csv("new_MASTER_01_data.csv", stringsAsFactors = TRUE)

# One-off preparation step, kept for reference and intentionally not run:
# it copied starttime/stoptime from the sample file into newStartTime/
# newStopTime and wrote the result back to the CSV loaded above.
# NOTE(review): write.csv() is called without row.names = FALSE, which is
# presumably where the row-index column X (dropped later) comes from.
#sample_01_data <- read.csv("sample_01_data.csv")
#masterdata$newStartTime <- sample_01_data$starttime
#masterdata$newStopTime <- sample_01_data$stoptime
#write.csv(masterdata,"new_MASTER_01_data.csv")

# Quick per-column overview of the raw table.
summary(masterdata)
##        X           tripduration       start.station.id
##  Min.   :     1   Min.   :     61.0   519    :  1558  
##  1st Qu.: 51380   1st Qu.:    363.0   497    :  1227  
##  Median :102759   Median :    616.0   3255   :  1200  
##  Mean   :102759   Mean   :    992.7   285    :  1145  
##  3rd Qu.:154138   3rd Qu.:   1081.0   402    :  1125  
##  Max.   :205517   Max.   :2678003.0   435    :  1089  
##                                       (Other):198173  
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  1558   Min.   :40.66          Min.   :-74.03         
##  E 17 St & Broadway   :  1227   1st Qu.:40.72          1st Qu.:-74.00         
##  8 Ave & W 31 St      :  1200   Median :40.74          Median :-73.99         
##  Broadway & E 14 St   :  1145   Mean   :40.74          Mean   :-73.98         
##  Broadway & E 22 St   :  1125   3rd Qu.:40.76          3rd Qu.:-73.97         
##  W 21 St & 6 Ave      :  1089   Max.   :40.86          Max.   :-73.89         
##  (Other)              :198173                                                 
##  end.station.id                end.station.name  end.station.latitude
##  519    :  1604   Pershing Square North:  1604   Min.   :40.66       
##  497    :  1254   E 17 St & Broadway   :  1254   1st Qu.:40.72       
##  402    :  1194   Broadway & E 22 St   :  1194   Median :40.74       
##  3255   :  1169   8 Ave & W 31 St      :  1169   Mean   :40.74       
##  285    :  1157   Broadway & E 14 St   :  1157   3rd Qu.:40.76       
##  426    :  1120   West St & Chambers St:  1120   Max.   :40.86       
##  (Other):198019   (Other)              :198019                       
##  end.station.longitude     bikeid            usertype        birth.year  
##  Min.   :-74.05        Min.   :14529   Customer  : 28805   Min.   :1886  
##  1st Qu.:-74.00        1st Qu.:25323   Subscriber:176712   1st Qu.:1969  
##  Median :-73.99        Median :30947                       Median :1983  
##  Mean   :-73.98        Mean   :29669                       Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35053                       3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                       Max.   :2003  
##                                                                          
##      gender           AWND        AWND_ATTRIBUTES      PRCP      
##  Min.   :0.000   Min.   : 1.120      : 22301      Min.   :0.000  
##  1st Qu.:1.000   1st Qu.: 2.910   ,,W:183216      1st Qu.:0.000  
##  Median :1.000   Median : 4.030                   Median :0.000  
##  Mean   :1.164   Mean   : 4.385                   Mean   :0.106  
##  3rd Qu.:1.000   3rd Qu.: 5.140                   3rd Qu.:0.040  
##  Max.   :2.000   Max.   :12.750                   Max.   :1.830  
##                  NA's   :22301                                   
##   PRCP_ATTRIBUTES        SNOW        SNOW_ATTRIBUTES        SNWD        
##  ,,W,2400 :186524   Min.   :0.000            :   545   Min.   :0.00000  
##  T,,W,2400: 18993   1st Qu.:0.000   ,,W,2400 :201841   1st Qu.:0.00000  
##                     Median :0.000   T,,W,2400:  3131   Median :0.00000  
##                     Mean   :0.019                      Mean   :0.02829  
##                     3rd Qu.:0.000                      3rd Qu.:0.00000  
##                     Max.   :4.000                      Max.   :3.90000  
##                     NA's   :545                                         
##   SNWD_ATTRIBUTES     TAVG         TAVG_ATTRIBUTES      TMAX      
##  ,,W,2400 :204127   Mode:logical   Mode:logical    Min.   :14.00  
##  T,,W,2400:  1390   NA's:205517    NA's:205517     1st Qu.:57.00  
##                                                    Median :71.00  
##                                                    Mean   :68.17  
##                                                    3rd Qu.:81.00  
##                                                    Max.   :95.00  
##                                                                   
##  TMAX_ATTRIBUTES      TMIN       TMIN_ATTRIBUTES      WDF2      
##  ,,W:205517      Min.   : 2.00   ,,W:205517      Min.   : 10.0  
##                  1st Qu.:42.00                   1st Qu.: 60.0  
##                  Median :56.00                   Median :220.0  
##                  Mean   :53.63                   Mean   :182.2  
##                  3rd Qu.:67.00                   3rd Qu.:280.0  
##                  Max.   :82.00                   Max.   :360.0  
##                                                  NA's   :22301  
##  WDF2_ATTRIBUTES      WDF5       WDF5_ATTRIBUTES      WSF2      
##     : 22301      Min.   : 10.0      : 22700      Min.   : 6.90  
##  ,,W:183216      1st Qu.: 70.0   ,,W:182817      1st Qu.:10.10  
##                  Median :220.0                   Median :12.10  
##                  Mean   :183.6                   Mean   :12.81  
##                  3rd Qu.:270.0                   3rd Qu.:15.00  
##                  Max.   :360.0                   Max.   :25.10  
##                  NA's   :22700                   NA's   :22301  
##  WSF2_ATTRIBUTES      WSF5       WSF5_ATTRIBUTES      WT01       
##     : 22301      Min.   :11.00      : 22700      Min.   :1       
##  ,,W:183216      1st Qu.:17.00   ,,W:182817      1st Qu.:1       
##                  Median :19.90                   Median :1       
##                  Mean   :20.77                   Mean   :1       
##                  3rd Qu.:23.00                   3rd Qu.:1       
##                  Max.   :40.90                   Max.   :1       
##                  NA's   :22700                   NA's   :123167  
##  WT01_ATTRIBUTES      WT02        WT02_ATTRIBUTES      WT03       
##     :123167      Min.   :1           :201690      Min.   :1       
##  ,,W: 82350      1st Qu.:1        ,,W:  3827      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :201690                   NA's   :186419  
##  WT03_ATTRIBUTES      WT06        WT06_ATTRIBUTES      WT08       
##     :186419      Min.   :1           :204101      Min.   :1       
##  ,,W: 19098      1st Qu.:1        ,,W:  1416      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :204101                   NA's   :172801  
##  WT08_ATTRIBUTES                   newStartTime   
##     :172801      2019-03-01 17:41:27.7210:     2  
##  ,,W: 32716      2019-07-31 17:48:23.5580:     2  
##                  2019-01-01 00:35:03.5980:     1  
##                  2019-01-01 01:14:01.5150:     1  
##                  2019-01-01 01:59:10.1080:     1  
##                  2019-01-01 02:47:03.7040:     1  
##                  (Other)                 :205509  
##                    newStopTime    
##  2019-05-28 09:10:01.3380:     2  
##  2019-07-12 08:43:08.3900:     2  
##  2019-01-01 00:38:10.6250:     1  
##  2019-01-01 01:58:41.1290:     1  
##  2019-01-01 02:12:34.9820:     1  
##  2019-01-01 02:55:16.4380:     1  
##  (Other)                 :205509
# Column types and first few values of the table as loaded, before any recoding.
utils::str(masterdata)
## 'data.frame':    205517 obs. of  48 variables:
##  $ X                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ tripduration           : int  110 1067 325 552 282 1150 178 777 423 144 ...
##  $ start.station.id       : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
##  $ start.station.name     : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
##  $ start.station.latitude : num  40.8 40.8 40.8 40.7 40.7 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
##  $ end.station.name       : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
##  $ end.station.latitude   : num  40.8 40.7 40.8 40.7 40.7 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  38891 38269 14654 15101 32868 30584 32492 30258 36783 36111 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
##  $ birth.year             : int  1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
##  $ gender                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ AWND                   : num  3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
##  $ AWND_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PRCP                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRCP_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
##  $ SNOW                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW_ATTRIBUTES        : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SNWD                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAVG                   : logi  NA NA NA NA NA NA ...
##  $ TAVG_ATTRIBUTES        : logi  NA NA NA NA NA NA ...
##  $ TMAX                   : int  87 39 70 87 85 85 80 88 49 60 ...
##  $ TMAX_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TMIN                   : int  73 32 52 75 72 68 63 75 33 38 ...
##  $ TMIN_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WDF2                   : int  70 250 40 60 220 290 150 140 10 260 ...
##  $ WDF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WDF5                   : int  40 220 40 70 220 290 150 140 360 260 ...
##  $ WDF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF2                   : num  8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
##  $ WSF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF5                   : num  17 19.9 13 13 19 15 16.1 15 25.1 23 ...
##  $ WSF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WT01                   : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ WT01_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
##  $ WT02                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT02_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT03                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT03_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT06                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT06_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT08                   : int  NA 1 NA NA NA NA 1 NA NA NA ...
##  $ WT08_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
##  $ newStartTime           : Factor w/ 205515 levels "2019-01-01 00:35:03.5980",..: 124947 197101 150074 92205 108911 100289 70041 126789 29559 28200 ...
##  $ newStopTime            : Factor w/ 205515 levels "2019-01-01 00:38:10.6250",..: 124930 197107 150056 92205 108899 100292 70035 126787 29556 28200 ...
#is the date data gone?
#summary(read.csv("sample_01_data.csv"))
#convert columns to factors as needed

# bikeid is an identifier rather than a quantity, so store it as a factor.
masterdata$bikeid <- as.factor(masterdata$bikeid)

# Recode the numeric gender codes in one step (0 = Unknown, 1 = Male,
# 2 = Female). The levels are listed as 2, 1, 0 so the resulting level order is
# Female, Male, Unknown -- the same alphabetical order the earlier nested
# ifelse() recode produced, which keeps plot colours and legend order stable.
# Any code other than 0, 1 or 2 becomes NA instead of being labelled "Female".
masterdata$gender <- factor(
  masterdata$gender,
  levels = c(2, 1, 0),
  labels = c("Female", "Male", "Unknown")
)

# Drop the row-index column. starttime/stoptime are not present in the table
# shown by str() above; assigning NULL to a missing column is a harmless no-op,
# so these stay as a guard against older exports that still carry them.
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL

# Parse the timestamp text into POSIXct in the session's local time zone.
# The format stops at whole seconds, so the fractional part is discarded.
masterdata$newStartTime <- as.POSIXct(
  strptime(masterdata$newStartTime, "%Y-%m-%d %H:%M:%S")
)
masterdata$newStopTime <- as.POSIXct(
  strptime(masterdata$newStopTime, "%Y-%m-%d %H:%M:%S")
)

# Derive the calendar date from the wall-clock time that is actually displayed.
# as.Date() on a POSIXct converts through UTC by default, which moved evening
# trips onto the following day (e.g. a 19:06 stop was dated the next day).
# Formatting to text first keeps the date consistent with the timestamp.
masterdata$newStartDate <- as.Date(
  format(masterdata$newStartTime, "%Y-%m-%d"),
  format = "%Y-%m-%d"
)
masterdata$newStopDate <- as.Date(
  format(masterdata$newStopTime, "%Y-%m-%d"),
  format = "%Y-%m-%d"
)


#distance
# Haversine (great-circle) distance between each trip's start and end station,
# using distHaversine() with an explicit sphere radius of 6378137 (metres, so
# the result is in metres as well).
# The coordinates are passed as two-column (longitude, latitude) matrices built
# directly from masterdata, so the number of rows is never hard-coded and the
# step keeps working when the input file changes size.
start_coords <- cbind(
  as.numeric(masterdata$start.station.longitude),
  as.numeric(masterdata$start.station.latitude)
)
end_coords <- cbind(
  as.numeric(masterdata$end.station.longitude),
  as.numeric(masterdata$end.station.latitude)
)

masterdata$distanceH <- distHaversine(start_coords, end_coords, r = 6378137)

# Remove the temporaries from the workspace (rm() deletes the bindings, unlike
# assigning NULL, which leaves the variables defined).
rm(start_coords, end_coords)

# Re-inspect the table after recoding and adding the derived columns.
str(masterdata)
## 'data.frame':    205517 obs. of  50 variables:
##  $ tripduration           : int  110 1067 325 552 282 1150 178 777 423 144 ...
##  $ start.station.id       : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
##  $ start.station.name     : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
##  $ start.station.latitude : num  40.8 40.8 40.8 40.7 40.7 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
##  $ end.station.name       : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
##  $ end.station.latitude   : num  40.8 40.7 40.8 40.7 40.7 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : Factor w/ 19094 levels "14529","14530",..: 16053 15465 97 422 11882 9967 11539 9673 14577 14187 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
##  $ birth.year             : int  1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
##  $ gender                 : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ AWND                   : num  3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
##  $ AWND_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ PRCP                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PRCP_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
##  $ SNOW                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNOW_ATTRIBUTES        : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SNWD                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD_ATTRIBUTES        : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TAVG                   : logi  NA NA NA NA NA NA ...
##  $ TAVG_ATTRIBUTES        : logi  NA NA NA NA NA NA ...
##  $ TMAX                   : int  87 39 70 87 85 85 80 88 49 60 ...
##  $ TMAX_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ TMIN                   : int  73 32 52 75 72 68 63 75 33 38 ...
##  $ TMIN_ATTRIBUTES        : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WDF2                   : int  70 250 40 60 220 290 150 140 10 260 ...
##  $ WDF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WDF5                   : int  40 220 40 70 220 290 150 140 360 260 ...
##  $ WDF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF2                   : num  8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
##  $ WSF2_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WSF5                   : num  17 19.9 13 13 19 15 16.1 15 25.1 23 ...
##  $ WSF5_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ WT01                   : int  1 NA NA NA NA NA NA NA NA NA ...
##  $ WT01_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
##  $ WT02                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT02_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT03                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT03_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT06                   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WT06_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WT08                   : int  NA 1 NA NA NA NA 1 NA NA NA ...
##  $ WT08_ATTRIBUTES        : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
##  $ newStartTime           : POSIXct, format: "2019-08-17 13:10:36" "2019-12-04 18:48:45" ...
##  $ newStopTime            : POSIXct, format: "2019-08-17 13:12:27" "2019-12-04 19:06:32" ...
##  $ newStartDate           : Date, format: "2019-08-17" "2019-12-04" ...
##  $ newStopDate            : Date, format: "2019-08-17" "2019-12-05" ...
##  $ distanceH              : num  413 2567 922 657 1099 ...
# Per-column overview of the table after cleaning and feature derivation.
base::summary(masterdata)
##   tripduration       start.station.id             start.station.name
##  Min.   :     61.0   519    :  1558   Pershing Square North:  1558  
##  1st Qu.:    363.0   497    :  1227   E 17 St & Broadway   :  1227  
##  Median :    616.0   3255   :  1200   8 Ave & W 31 St      :  1200  
##  Mean   :    992.7   285    :  1145   Broadway & E 14 St   :  1145  
##  3rd Qu.:   1081.0   402    :  1125   Broadway & E 22 St   :  1125  
##  Max.   :2678003.0   435    :  1089   W 21 St & 6 Ave      :  1089  
##                      (Other):198173   (Other)              :198173  
##  start.station.latitude start.station.longitude end.station.id  
##  Min.   :40.66          Min.   :-74.03          519    :  1604  
##  1st Qu.:40.72          1st Qu.:-74.00          497    :  1254  
##  Median :40.74          Median :-73.99          402    :  1194  
##  Mean   :40.74          Mean   :-73.98          3255   :  1169  
##  3rd Qu.:40.76          3rd Qu.:-73.97          285    :  1157  
##  Max.   :40.86          Max.   :-73.89          426    :  1120  
##                                                 (Other):198019  
##               end.station.name  end.station.latitude end.station.longitude
##  Pershing Square North:  1604   Min.   :40.66        Min.   :-74.05       
##  E 17 St & Broadway   :  1254   1st Qu.:40.72        1st Qu.:-74.00       
##  Broadway & E 22 St   :  1194   Median :40.74        Median :-73.99       
##  8 Ave & W 31 St      :  1169   Mean   :40.74        Mean   :-73.98       
##  Broadway & E 14 St   :  1157   3rd Qu.:40.76        3rd Qu.:-73.97       
##  West St & Chambers St:  1120   Max.   :40.86        Max.   :-73.89       
##  (Other)              :198019                                             
##      bikeid             usertype        birth.year       gender      
##  35306  :    44   Customer  : 28805   Min.   :1886   Female : 49419  
##  34019  :    41   Subscriber:176712   1st Qu.:1969   Male   :140370  
##  34958  :    41                       Median :1983   Unknown: 15728  
##  35029  :    41                       Mean   :1980                   
##  35324  :    41                       3rd Qu.:1990                   
##  33885  :    40                       Max.   :2003                   
##  (Other):205269                                                      
##       AWND        AWND_ATTRIBUTES      PRCP        PRCP_ATTRIBUTES  
##  Min.   : 1.120      : 22301      Min.   :0.000   ,,W,2400 :186524  
##  1st Qu.: 2.910   ,,W:183216      1st Qu.:0.000   T,,W,2400: 18993  
##  Median : 4.030                   Median :0.000                     
##  Mean   : 4.385                   Mean   :0.106                     
##  3rd Qu.: 5.140                   3rd Qu.:0.040                     
##  Max.   :12.750                   Max.   :1.830                     
##  NA's   :22301                                                      
##       SNOW        SNOW_ATTRIBUTES        SNWD          SNWD_ATTRIBUTES  
##  Min.   :0.000            :   545   Min.   :0.00000   ,,W,2400 :204127  
##  1st Qu.:0.000   ,,W,2400 :201841   1st Qu.:0.00000   T,,W,2400:  1390  
##  Median :0.000   T,,W,2400:  3131   Median :0.00000                     
##  Mean   :0.019                      Mean   :0.02829                     
##  3rd Qu.:0.000                      3rd Qu.:0.00000                     
##  Max.   :4.000                      Max.   :3.90000                     
##  NA's   :545                                                            
##    TAVG         TAVG_ATTRIBUTES      TMAX       TMAX_ATTRIBUTES      TMIN      
##  Mode:logical   Mode:logical    Min.   :14.00   ,,W:205517      Min.   : 2.00  
##  NA's:205517    NA's:205517     1st Qu.:57.00                   1st Qu.:42.00  
##                                 Median :71.00                   Median :56.00  
##                                 Mean   :68.17                   Mean   :53.63  
##                                 3rd Qu.:81.00                   3rd Qu.:67.00  
##                                 Max.   :95.00                   Max.   :82.00  
##                                                                                
##  TMIN_ATTRIBUTES      WDF2       WDF2_ATTRIBUTES      WDF5      
##  ,,W:205517      Min.   : 10.0      : 22301      Min.   : 10.0  
##                  1st Qu.: 60.0   ,,W:183216      1st Qu.: 70.0  
##                  Median :220.0                   Median :220.0  
##                  Mean   :182.2                   Mean   :183.6  
##                  3rd Qu.:280.0                   3rd Qu.:270.0  
##                  Max.   :360.0                   Max.   :360.0  
##                  NA's   :22301                   NA's   :22700  
##  WDF5_ATTRIBUTES      WSF2       WSF2_ATTRIBUTES      WSF5      
##     : 22700      Min.   : 6.90      : 22301      Min.   :11.00  
##  ,,W:182817      1st Qu.:10.10   ,,W:183216      1st Qu.:17.00  
##                  Median :12.10                   Median :19.90  
##                  Mean   :12.81                   Mean   :20.77  
##                  3rd Qu.:15.00                   3rd Qu.:23.00  
##                  Max.   :25.10                   Max.   :40.90  
##                  NA's   :22301                   NA's   :22700  
##  WSF5_ATTRIBUTES      WT01        WT01_ATTRIBUTES      WT02       
##     : 22700      Min.   :1           :123167      Min.   :1       
##  ,,W:182817      1st Qu.:1        ,,W: 82350      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :123167                   NA's   :201690  
##  WT02_ATTRIBUTES      WT03        WT03_ATTRIBUTES      WT06       
##     :201690      Min.   :1           :186419      Min.   :1       
##  ,,W:  3827      1st Qu.:1        ,,W: 19098      1st Qu.:1       
##                  Median :1                        Median :1       
##                  Mean   :1                        Mean   :1       
##                  3rd Qu.:1                        3rd Qu.:1       
##                  Max.   :1                        Max.   :1       
##                  NA's   :186419                   NA's   :204101  
##  WT06_ATTRIBUTES      WT08        WT08_ATTRIBUTES  newStartTime                
##     :204101      Min.   :1           :172801      Min.   :2019-01-01 00:35:03  
##  ,,W:  1416      1st Qu.:1        ,,W: 32716      1st Qu.:2019-05-03 06:30:28  
##                  Median :1                        Median :2019-07-18 16:48:45  
##                  Mean   :1                        Mean   :2019-07-12 13:31:09  
##                  3rd Qu.:1                        3rd Qu.:2019-09-23 18:04:58  
##                  Max.   :1                        Max.   :2019-12-31 23:33:21  
##                  NA's   :172801                                                
##   newStopTime                   newStartDate         newStopDate        
##  Min.   :2019-01-01 00:38:10   Min.   :2019-01-01   Min.   :2019-01-01  
##  1st Qu.:2019-05-03 06:49:10   1st Qu.:2019-05-03   1st Qu.:2019-05-03  
##  Median :2019-07-18 16:59:56   Median :2019-07-18   Median :2019-07-18  
##  Mean   :2019-07-12 13:47:42   Mean   :2019-07-12   Mean   :2019-07-12  
##  3rd Qu.:2019-09-23 18:18:30   3rd Qu.:2019-09-23   3rd Qu.:2019-09-23  
##  Max.   :2020-01-02 09:26:42   Max.   :2020-01-01   Max.   :2020-01-02  
##                                                                         
##    distanceH      
##  Min.   :    0.0  
##  1st Qu.:  825.9  
##  Median : 1375.9  
##  Mean   : 1779.9  
##  3rd Qu.: 2305.2  
##  Max.   :13812.2  
## 
#average precip per month
#facet wrap month, against count of number of rides as y variable. x = average precip, colour = gender
#categorize birth year into age groups. Facet wrap or colour by age groups
#rain vs trip length (end time - start time)
#bar plot of busy/not-busy stations on rainy days
#randomly try the same plot across a few select dates


#newStartDate vs tripduration by gender
# Shared aesthetic mapping for every plot in this group.
date_gender_aes <- aes(x = newStartDate, y = tripduration, colour = gender)

# All trips, with no duration filter applied.
ggplot(data = masterdata, mapping = date_gender_aes) + geom_point()

# The remaining plots keep only trips with tripduration below 10000.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
date_gender_plot <- ggplot(data = short_trips, mapping = date_gender_aes)

date_gender_plot + geom_point()

date_gender_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# NOTE(review): x is a continuous date here, which is presumably what triggers
# the position_dodge warning recorded below -- confirm this grouping is intended.
date_gender_plot + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals

date_gender_plot + geom_boxplot()

#newStartDate vs tripduration by usertype
# Restrict to trips with tripduration below 10000 and build the base plot once;
# each line below only adds a different geom to it.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
date_usertype_plot <- ggplot(
  data = short_trips,
  mapping = aes(x = newStartDate, y = tripduration, colour = usertype)
)

date_usertype_plot + geom_point()

date_usertype_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

date_usertype_plot + geom_violin()

date_usertype_plot + geom_boxplot()

#prcp vs tripduration by gender
# Restrict to trips with tripduration below 10000 and build the base plot once;
# each line below only adds a different geom to it.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
prcp_duration_gender_plot <- ggplot(
  data = short_trips,
  mapping = aes(x = PRCP, y = tripduration, colour = gender)
)

prcp_duration_gender_plot + geom_point()

prcp_duration_gender_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_gender_plot + geom_violin()

prcp_duration_gender_plot + geom_boxplot()

#prcp vs tripduration by usertype
# Restrict to trips with tripduration below 10000 and build the base plot once;
# each line below only adds a different geom to it.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
prcp_duration_usertype_plot <- ggplot(
  data = short_trips,
  mapping = aes(x = PRCP, y = tripduration, colour = usertype)
)

prcp_duration_usertype_plot + geom_point()

prcp_duration_usertype_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_usertype_plot + geom_violin()

prcp_duration_usertype_plot + geom_boxplot()

#prcp vs distanceH by gender
# y is the haversine distance column here, not tripduration; the tripduration
# filter (< 10000) is still applied so the same trips are shown as in the
# duration plots.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
prcp_distance_gender_plot <- ggplot(
  data = short_trips,
  mapping = aes(x = PRCP, y = distanceH, colour = gender)
)

prcp_distance_gender_plot + geom_point()

prcp_distance_gender_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_distance_gender_plot + geom_violin()

prcp_distance_gender_plot + geom_boxplot()

#prcp vs distanceH by usertype
# y is the haversine distance column here, not tripduration; the tripduration
# filter (< 10000) is still applied so the same trips are shown as in the
# duration plots.
short_trips <- masterdata[masterdata$tripduration < 10000, ]
prcp_distance_usertype_plot <- ggplot(
  data = short_trips,
  mapping = aes(x = PRCP, y = distanceH, colour = usertype)
)

prcp_distance_usertype_plot + geom_point()

prcp_distance_usertype_plot + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_distance_usertype_plot + geom_violin()

prcp_distance_usertype_plot + geom_boxplot()